rm(list=ls())
gc()
require(foreign)
require(ineq)
require(dplyr)
library(Hmisc)
require(corrplot)
require(cem)
require(BayesTree)
require(stargazer)
require(texreg)
require(systemfit)
require(SemiPar)
setwd(paste(substr(getwd(),1,regexpr("Dropbox",getwd())[1]-1),"Dropbox/coauthors/AMT_Bisbee_Larson/bisbee_larson_replication_final",sep=""))
source("./tools/leg2.R")
getwd()

###################################################################################
##
##  File Name: models_results.R
##
##  Input Files: final_data_clean.dta
##  Output Files: donate-simulations.RData        
##                donate-homo-simulations.RData
##                models_donate_pg_bivariate.pdf  (Figure 9)
##                models_donate_homopol_all.pdf   (Figure 10)
##
##  Purpose:  This file conducts the random cross-validation and creates the associated
##            figures (#9 and #10) presented in the paper. In addition, the script generates 
##            .RData files of the simulated results to save computing time. The function
##            defined at the top of the script implements the cross-validation simulations. 
##            Note that the cross-validations, by virtue of being random, may yield slightly
##            different results from the published paper, due only to the randomness of the 
##            sub-samples.
##
##############################################################################################

## Creating Functions
cv.func <- function(form,dat,model = "biv",split = "online",counter) {
  if(model == "sur") {
    dat2 <- dat[,which(grepl(paste(paste(unlist(sapply(1:17,function(x) as.character(form[[x]][3]))),collapse="|"),"|online|",as.character(form[[1]][2]),sep=""),colnames(dat)))]
    dat2 <- dat2[complete.cases(dat2),]
  } else {
    dat2 <- dat[,which(grepl(paste(gsub("\\+|\\~","|",as.character(form)),"|ipwbart|cem_weights|online",sep=""),colnames(dat)))]
    dat2 <- dat2[complete.cases(dat2),]
  }
  if(split == "online") {
    train <- dat2 %>% filter(online == 1)
    test <- dat2 %>% filter(online == 0)
  } else if(split == "online2") {
    ind <- sample(1:nrow(dat2),size = round(nrow(dat2)/2,0),replace = F)
    tmp <- dat2[ind,]
    train <- tmp %>% filter(online == 1)
    test <- tmp %>% filter(online == 0)
  } else if(split == "sim") {
    ind <- sample(1:nrow(dat2),size = round(nrow(dat2)/2,0),replace = F)
    train <- dat2[ind,]
    test <- dat2[-ind,]
  } else {
    tmp <- dat2[which(dat2$online == 0),]
    ind <- sample(1:nrow(tmp),size = round(nrow(tmp)/2,0),replace = F)
    train <- tmp[ind,]
    test <- tmp[-ind,]
  }
  if(model == "biv" | model == "controls") {
    w <- rep(1,nrow(train))
    mod <- lm(as.formula(form),data = train)
  } else if(model == "ipw") {
    w <- train$ipwbart
    mod <- lm(as.formula(form),dat = train,weights = w)
  } else if(model == "cem") {
    w <- train$cem_weights
    mod <- lm(as.formula(form),dat = train,weights = w)
  } else if(model == "sur") {
    w <- rep(1,nrow(train))
    mod <- systemfit(form,data=train,method = "SUR")
  }
  res <- predict(mod,newdata = test)
  if(model == "sur") {
    n <- nrow(res)
    
    ll <- mean(sapply(1:ncol(res),function(x) 0.5 * (sum(log(w)) - n * (log(2 * pi) + 1 - log(n) + log(sum(w * res[,x]^2,na.rm=T))))))
    df.ll <- mod$rank + 1
    bic <- -2 * ll + log(n) * df.ll
    
    bias <- mean(sapply(1:ncol(res),function(x) mean(res[,x],na.rm=T)))
    biassq <- mean(sapply(1:ncol(res),function(x) mean(res[,x]^2,na.rm=T)))
    rmse <- sqrt(biassq)
    sd <- mean(sapply(1:ncol(res),function(x) sd(res[,x],na.rm=T)))
  } else {
    n <- length(res)
    
    ll <- 0.5 * (sum(log(w) * is.finite(log(w)),na.rm=T) - n * (log(2 * pi) + 1 - log(n) + log(sum(w * res^2,na.rm=T))))
    df.ll <- mod$rank + 1
    bic <- -2 * ll + log(n) * df.ll
    
    bias <- mean(res,na.rm=T)
    biassq <- mean(res^2,na.rm=T)
    rmse <- sqrt(biassq)
    sd <- sd(res,na.rm=T)
  }
  #cat(paste(round(counter/1000,2),"\n"))
  return(list(bic = bic,bias = bias,biassq = biassq,rmse = rmse,sd = sd))
}

dat <- read.dta("./tools/final_data_clean.dta")

###############################################################################################
# Cross validation analysis: preparing the data
###############################################################################################
ties <- c("clique","weakest","weak","strong","strongest")
mod.results <- list(NA)
j = 1
for(mods in c("biv","controls","ipw","cem")) {#},"sur")) {
  i = 1
  raw.sims <- list(NA)
  for(tie in ties) {
    if(mods == "biv" | mods == "ipw" | mods == "cem") {
      form <- paste("std_donate_",tie,"~std_personalgain_",tie,sep="")
    } else if(mods == "controls") {
      form <- paste("std_donate_",tie,"~std_personalgain_",tie,"+",paste(colnames(dat)[which(grepl("^dem2_",colnames(dat)))],collapse="+"),sep="")
    } else if(mods == "sur") {
      controls <- colnames(dat)[which(grepl(tie,colnames(dat)))][which(grepl("^std_",colnames(dat)[which(grepl(tie,colnames(dat)))]))]
      controls <- controls[-which(grepl("std_tie_synth|std_donate_",controls))]
      if(tie == "weak" | tie == "strong") {
        controls <- controls[-which(grepl("est",controls))]
      }
      form <- as.list(paste("std_donate_",tie,"~",controls,sep=""))
      form <- sapply(1:17,function(x) as.formula(form[[x]][1]))
    }
    raw.sims[[i]] <- sapply(1:1000,function(x) cv.func(form,dat,model=mods,split="sim",counter = x))
    i = i+1
  }
  mod.results[[j]] <- raw.sims
  j = j+1
}

save(mod.results,file = "./models/donate-simulations.RData")

ties <- c("clique","weakest","weak","strong","strongest")
homo.results <- list(NA)
j = 1
for(mods in c("biv","controls","ipw","cem")) {#},"sur")) {
  i = 1
  raw.sims <- list(NA)
  for(tie in ties) {
    if(mods == "biv" | mods == "ipw" | mods == "cem") {
      form <- paste("std_donate_",tie,"~std_homo_pol_",tie,sep="")
    } else if(mods == "controls") {
      form <- paste("std_donate_",tie,"~std_homo_pol_",tie,"+",paste(colnames(dat)[which(grepl("^dem2_",colnames(dat)))],collapse="+"),sep="")
    } else if(mods == "sur") {
      controls <- colnames(dat)[which(grepl(tie,colnames(dat)))][which(grepl("^std_",colnames(dat)[which(grepl(tie,colnames(dat)))]))]
      controls <- controls[-which(grepl("std_tie_synth|std_donate_",controls))]
      if(tie == "weak" | tie == "strong") {
        controls <- controls[-which(grepl("est",controls))]
      }
      tmp <- as.list(paste("std_donate_",tie,"~",controls,sep=""))
      form <- sapply(1:17,function(x) as.formula(tmp[[x]][1]))
    }
    raw.sims[[i]] <- sapply(1:1000,function(x) cv.func(form,dat,model=mods,split="sim",counter = x))
    i = i+1
  }
  homo.results[[j]] <- raw.sims
  j = j+1
}

save(homo.results,file = "./models/donate-homo-simulations.RData")




###############################################################################################
# models_donate_pg_bivariate.pdf: Figure 9
###############################################################################################
load("./models/donate-simulations.RData")

y.axis <- 1:5

ties <- c("clique","weakest","weak","strong","strongest")
min.1 <- -5
max.1 <- 4
simpleCap <- function(x) {
  s <- strsplit(x, " ")[[1]]
  paste(toupper(substring(s, 1,1)), substring(s, 2),
        sep="", collapse=" ")
}
set.seed(123)
min.1 <- -3
max.1 <- 3
pdf("./1_Figures/models_donate_pg_bivariate.pdf",height=7,width=7)
par(xpd=F)
plot(rep(0,5),y.axis,type='n',xlab="",ylab="",yaxt='n',
     xlim=c(min.1,max.1),bty='n',ylim=c(1,6),xaxt='n',main="Bias and Variance in Out-of-Sample Predictions")
mtext(capitalize(ties),2,at=y.axis,las=1,cex=.7,adj=0,padj=-.5)
segments(seq(min.1+1,max.1-1,by=1),1,seq(min.1+1,max.1-1,by=1),5.75,col="grey80")
mtext(seq(min.1+1,max.1-1,by=2),3,at=seq(min.1+1,max.1-1,by=2),cex=.7,col="grey60",line=-1.5)
mtext("% of Sims > On/Off",3,at=2.7,line=-2.5,cex=.7)
i = 1
for(tie in ties) {
  sims <- sapply(1:1000, function(x) rnorm(100,mod.results[[1]][[i]][,x]$bias,mod.results[[1]][[i]][,x]$sd))
  bic.sims <- sapply(1:1000, function(x) mod.results[[1]][[i]][,x]$bic)
  rmse.sims <- sapply(1:1000,function(x) mod.results[[1]][[i]][,x]$rmse)
  absbias.sims <- sapply(1:1000,function(x) abs(mod.results[[1]][[i]][,x]$bias))
  
  x.full <- density(sims)$x
  raw.full <- density(sims)$y
  rescale.full <- (raw.full-min(raw.full))/(max(raw.full)-min(raw.full))/2.5
  hx.full <- c(rescale.full+i)
  hgt <- max(rescale.full)
  
  polygon(x.full,hx.full, col=rgb(0,0,0,.2), border = rgb(0,0,0,.2),lwd=1)
  
  raw.onoff <- cv.func(paste("std_donate_",tie,"~std_personalgain_",tie,sep=""),dat,model="biv",split="online",counter=1)
  onoff <- rnorm(100000,raw.onoff$bias,raw.onoff$sd)
  bic.onoff <- raw.onoff$bic
  rmse.onoff <- raw.onoff$rmse
  absbias.onoff <- abs(raw.onoff$bias)
  
  x.full <- density(onoff)$x
  raw.full <- density(onoff)$y
  rescale.full <- (raw.full-min(raw.full))/(max(raw.full)-min(raw.full))/2.5
  hx.full <- c(rescale.full+i)
  hgt <- max(rescale.full)

  polygon(x.full,hx.full, col=rgb(1,1,1,.2), lty=2)
  
  bic.test <- mean((bic.sims - bic.onoff)>2)
  rmse.test <- mean(rmse.sims > rmse.onoff)
  absbias.test <- mean(absbias.sims > absbias.onoff)
  stars.bic <- ifelse(bic.test <= .01 | bic.test >= .99,"***",ifelse(bic.test<=.05 | bic.test >= .95,"**",ifelse(bic.test<=.1 | bic.test >= .9,"*","")))
  cols.bic <- ifelse(bic.test <= .1 | bic.test >= .9,rgb(0,0,0,.7),rgb(0,0,0,.2))
  stars.rmse <- ifelse(rmse.test <= .01 | rmse.test >= .99,"***",ifelse(rmse.test<=.05 | rmse.test >= .95,"**",ifelse(rmse.test<=.1 | rmse.test >= .9,"*","")))
  cols.rmse <- ifelse(rmse.test <= .1 | rmse.test >= .9,rgb(0,0,0,.7),rgb(0,0,0,.2))
  stars.absbias <- ifelse(absbias.test <= .01 | absbias.test >= .99,"***",ifelse(absbias.test<=.05 | absbias.test >= .95,"**",ifelse(absbias.test<=.1 | absbias.test >= .9,"*","")))
  cols.absbias <- ifelse(absbias.test <= .1 | absbias.test >= .9,rgb(0,0,0,.7),rgb(0,0,0,.2))
  
  text(2.2,i+hgt+.1,cex = .7,adj= 0,labels = paste("BIC: ",round(bic.test,2),stars.bic,sep=""),col="black")
  text(2.2,i+hgt,cex = .7,adj = 0,labels = paste("RMSE: ",round(rmse.test,2),stars.rmse,sep=""),col="black")
  text(2.2,i+hgt-.1,cex = .7,adj = 0,labels = paste("Abs. Bias: ",round(absbias.test,2),stars.absbias,sep=""),col="black")
   
  segments(raw.onoff$bias,i,raw.onoff$bias,i+hgt,lty=2)
  segments(mean(sims),i,mean(sims),i+hgt,lty=1)
  i = i+1
}
abline(h=y.axis)
par(xpd=T)
leg2("center",legend=c("Online / Offline Error","Random CV Error"),fill.lty=c(2,0),
     inset=c(0,-.15),horiz=T,
     fill = c("white",rgb(168,168,168,max=255,alpha=250)),cex=.9,bty='n',pt.cex=1.3,box.col="grey80",bg="white")
dev.off()


###############################################################################################
# models_donate_homopol_all.pdf: Figure 10
###############################################################################################
load("./models/donate-homo-simulations.RData")

ties <- c("clique","weakest","weak","strong","strongest")
  y.axis <- 1:5
set.seed(1)
  pdf(paste0("./1_Figures/models_donate_homopol_all.pdf"),height=5,width = 7)
  par(xpd=F)
  nf <- layout(matrix(c(1,2,3,4,5,6,6,6,6,6),2,5,byrow=T),widths=c(.4,1.4,1.4,1.4,1.4),heights=c(6,1))
  #layout.show(nf)
  par(mar=c(0,2,3,0))
  plot(rep(0,5),y.axis,type='n',xlab="",ylab="",yaxt='n',
       bty='n',ylim=c(1,6),xaxt='n',main="")
  mtext(capitalize(ties),2,at=y.axis,las=1,cex=.7,adj=0,padj=-.5)
  abline(h=y.axis)
  par(xpd=T)
  j = 1
  for (mods in c("biv","controls","ipw","cem")) { #,"sur")) {
    min.1 <- -1
    max.1 <- 1
    mains <- ifelse(mods == "biv","Bivariate",ifelse(mods == "controls","Controls",toupper(mods)))
    plot(rep(0,5),y.axis,type='n',xlab="",ylab="",yaxt='n',
         xlim=c(min.1,max.1),bty='n',ylim=c(1,6),xaxt='n',main=mains)
    i = 1
    for(tie in ties) {
      gap <- (max.1 - min.1)/5
      sims <- sapply(1:1000, function(x) rnorm(10,homo.results[[j]][[i]][,x]$bias,homo.results[[j]][[i]][,x]$sd))
      bic.sims <- sapply(1:1000, function(x) homo.results[[j]][[i]][,x]$bic)
      rmse.sims <- sapply(1:1000,function(x) homo.results[[j]][[i]][,x]$rmse)
      absbias.sims <- sapply(1:1000,function(x) abs(homo.results[[j]][[i]][,x]$bias))
      
      x.full <- density(sims)$x
      raw.full <- density(sims)$y
      rescale.full <- (raw.full-min(raw.full))/(max(raw.full)-min(raw.full))/2.5
      hx.full <- c(rescale.full+i)
      hgt <- max(rescale.full)
      
      polygon(x.full,hx.full, col=rgb(0,0,0,.2), border = rgb(0,0,0,.2),lwd=1)
      
      if(mods == "biv" | mods == "ipw" | mods == "cem") {
        form <- paste("std_donate_",tie,"~std_homo_pol_",tie,sep="")
      } else if(mods == "controls") {
        form <- paste("std_donate_",tie,"~std_homo_pol_",tie,"+",paste(colnames(dat)[which(grepl("^dem2_",colnames(dat)))],collapse="+"),sep="")
      } else if(mods == "sur") {
        controls <- colnames(dat)[which(grepl(tie,colnames(dat)))][which(grepl("^std_",colnames(dat)[which(grepl(tie,colnames(dat)))]))]
        controls <- controls[-which(grepl("std_tie_synth|std_donate_",controls))]
        if(tie == "weak" | tie == "strong") {
          controls <- controls[-which(grepl("est",controls))]
        }
        tmp <- as.list(paste("std_donate_",tie,"~",controls,sep=""))
        form <- sapply(1:17,function(x) as.formula(tmp[[x]][1]))
      }
      raw.onoff <- cv.func(form,dat,model=mods,split="online",counter = 1)
      onoff <- rnorm(10000,raw.onoff$bias,raw.onoff$sd)
      bic.onoff <- raw.onoff$bic
      rmse.onoff <- raw.onoff$rmse
      absbias.onoff <- abs(raw.onoff$bias)
      
      x.full <- density(onoff)$x
      raw.full <- density(onoff)$y
      rescale.full <- (raw.full-min(raw.full))/(max(raw.full)-min(raw.full))/2.5
      hx.full <- c(rescale.full+i)
      hgt <- max(rescale.full)
      
      polygon(x.full,hx.full, col=rgb(1,1,1,.2), lty=2)
      
      bic.test <- mean((bic.sims - bic.onoff)>2)
      rmse.test <- mean(rmse.sims > rmse.onoff)
      absbias.test <- mean(absbias.sims > absbias.onoff)
      
      stars.rmse <- ifelse(rmse.test <= .01 | rmse.test >= .99,"***",ifelse(rmse.test <= .05 | rmse.test >= .95,"**",ifelse(rmse.test <= .1 | rmse.test >= .9,"*","")))
      cols.rmse <- ifelse(rmse.test <=.1 | rmse.test >=.9,rgb(0,0,0,.7),rgb(0,0,0,.2))
      stars.absbias <- ifelse(absbias.test <=.01 | absbias.test >=.99,"***",ifelse(absbias.test <=.05 | absbias.test >=.95,"**",ifelse(absbias.test <=.1 | absbias.test >=.9,"*","")))
      cols.absbias <- ifelse(absbias.test <=.1 | absbias.test >=.9,rgb(0,0,0,.7),rgb(0,0,0,.2))
      stars.bic <- ifelse(bic.test <=.01 | bic.test >=.99,"***",ifelse(bic.test <=.05 | bic.test >=.95,"**",ifelse(bic.test <=.1 | bic.test >=.9,"*","")))
      cols.bic <- ifelse(bic.test <=.1 | bic.test >=.9,rgb(0,0,0,.7),rgb(0,0,0,.2))
      
      text(0,i+hgt+.1,cex = .7,adj = 0,labels = paste("BIC: ",round(bic.test,2),stars.bic,sep=""),pos = 4,offset = 1,col = "black")
      text(0,i+hgt,cex = .7,adj = 0,labels = paste("RMSE: ",round(rmse.test,2),stars.rmse,sep=""),pos = 4,offset = 1,col = "black")
      #text(0,i+hgt-.1,cex = .7,adj = 0,labels = paste("Abs. Bias: ",round(absbias.test,2),stars.absbias,sep=""),pos = 4,offset = 1,col = cols.absbias)
      # 
      segments(raw.onoff$bias,i,raw.onoff$bias,i+hgt,lty=2)
      segments(mean(sims),i,mean(sims),i+hgt,lty=1)
      i = i+1
    }
    abline(h=y.axis)
    segments(0,1,0,5.75,col=rgb(0,0,0,.2))
    text(0,6,labels = 0,col=rgb(0,0,0,.2))
    j = j+1
  }
  par(xpd=T)
  par(mar=c(0,0,0,0))
  plot(0,0,type='n',xlab="",ylab="",xaxt='n',yaxt='n',bty='n')
  leg2("center",legend=c("Online / Offline Error","Random CV Error"),fill.lty=c(2,0),
       inset=c(0,-.15),horiz=T,
       fill = c("white",rgb(168,168,168,max=255,alpha=250)),cex=1,bty='n',pt.cex=1.3,box.col="grey80",bg="white")
  dev.off()
  